Set up

Here, we’re just setting a few options.

# Chunk options applied to every chunk while the codebook is knitted.
knitr::opts_chunk$set(
  warning = TRUE, # show warnings during codebook generation
  message = TRUE, # show messages during codebook generation
  error = TRUE,   # do not interrupt codebook generation in case of errors,
                  # usually better for debugging
  echo = TRUE     # show R code
)
# Install pacman only when it is missing. requireNamespace() tests for
# availability without attaching the package (it is only called via
# `pacman::` below), whereas require() is meant for attaching.
if (!requireNamespace("pacman", quietly = TRUE)) {
  install.packages("pacman")
}
# Load the packages used in the rest of this document.
pacman::p_load("tidyverse", "codebook")
# Use the black-and-white ggplot2 theme for all plots.
ggplot2::theme_set(ggplot2::theme_bw())

Now, we’re preparing our data for the codebook.

# Code book
# NOTE(review): this is an absolute, machine-specific path; the document only
# knits where the file exists at exactly this location.
codebook_data <- read_tsv(
  "/Users/junyichu/Dropbox (MIT)/2 NOW/2018 Physics/Videos for QI/prephys_split0_videos.tsv",
  # Pin the column types instead of relying on readr's type guessing. This is
  # the specification readr previously reported for this file.
  col_types = cols(
    studyID = col_character(),
    childID = col_character(),
    videoID = col_character(),
    n_coders = col_double(),
    videoFileName = col_character(),
    codingFile1 = col_character(),
    codingFile2 = col_character(),
    codingFile3 = col_character(),
    child.ageSessionRounded = col_double(),
    child.gender = col_character(),
    parent.race.nonwhite = col_character(),
    video.privacy = col_character(),
    consentnotes = col_character(),
    usable = col_character(),
    video.nTrialsExpected = col_double(),
    video.nTrialsFound = col_double(),
    video.expectedDuration = col_double(),
    video.actualDuration = col_double(),
    which.dataset = col_character()
  )
)
# Meta data
# Dataset-level fields attached to the data frame: name, description,
# publication date and creator.
metadata(codebook_data)$name <- "Preferential Physics on Lookit dataset"
metadata(codebook_data)$description <- "The annotated and shareable subset of video, annotation, and participant data"
metadata(codebook_data)$datePublished <- "2022-01-14"
# The creator is a nested list: a Person with an Organization affiliation.
metadata(codebook_data)$creator <- list(
  "@type" = "Person",
  givenName = "Junyi", familyName = "Chu",
  email = "jchu@mit.edu",
  affiliation = list(
    "@type" = "Organization",
    name = "MIT"
  )
)

Label variables

# Human-readable label for every column, keyed by column name.
variable_labels <- list(
  studyID = "Study version identifier",
  childID = "Child identifier",
  videoID = "Video identifier",
  n_coders = "Number of human annotations available for the session",
  videoFileName = "Video file name",
  codingFile1 = "File name of annotation by primary coder",
  codingFile2 = "File name of annotation by second coder",
  codingFile3 = "File name of annotation by third coder",
  child.ageSessionRounded = "Child age in months (floor of exact age)",
  child.gender = "Child gender",
  parent.race.nonwhite = "Whether parent reported a non-white race",
  video.privacy = "Video sharing and storage permissions",
  consentnotes = "Any notes from parent consent process",
  usable = "Any notes from video download process on video quality",
  video.nTrialsExpected = "Number of trials expected",
  video.nTrialsFound = "Number of trials in video",
  video.expectedDuration = "Expected video duration in seconds",
  video.actualDuration = "Actual video duration in seconds",
  which.dataset = "Specifies video assignment to training,validation and test sets"
)
# Attach the labels to the matching columns of the data.
var_label(codebook_data) <- variable_labels

Codebook

# Persist the data together with its metadata as an R data structure (.rds) file.
rio::export(
  x = codebook_data,
  file = "physics-for-qi.rds"
)

Now, generating a codebook is as simple as calling `codebook()` from a chunk in an R Markdown document.

# Render the codebook for the labelled data set prepared in the chunks above.
codebook::codebook(codebook_data)

Metadata

Description

Dataset name: Preferential Physics on Lookit dataset

The annotated and shareable subset of video, annotation, and participant data

Metadata for search engines
  • Date published: 2022-01-14

  • Creator:

name value
@type Person
givenName Junyi
familyName Chu
email jchu@mit.edu
affiliation Organization, MIT
x
studyID
childID
videoID
n_coders
videoFileName
codingFile1
codingFile2
codingFile3
child.ageSessionRounded
child.gender
parent.race.nonwhite
video.privacy
consentnotes
usable
video.nTrialsExpected
video.nTrialsFound
video.expectedDuration
video.actualDuration
which.dataset

Variables

studyID

Study version identifier

Distribution

Distribution of values for studyID

Distribution of values for studyID

0 missing values.

Summary statistics

name label data_type n_missing complete_rate n_unique empty min max whitespace
studyID Study version identifier character 0 1 2 0 24 36 0

childID

Child identifier

Distribution

Distribution of values for childID

Distribution of values for childID

0 missing values.

Summary statistics

name label data_type n_missing complete_rate n_unique empty min max whitespace
childID Child identifier character 0 1 73 0 5 36 0

videoID

Video identifier

Distribution

Distribution of values for videoID

Distribution of values for videoID

0 missing values.

Summary statistics

name label data_type n_missing complete_rate n_unique empty min max whitespace
videoID Video identifier character 0 1 225 0 24 36 0

n_coders

Number of human annotations available for the session

Distribution

Distribution of values for n_coders

Distribution of values for n_coders

0 missing values.

Summary statistics

name label data_type n_missing complete_rate min median max mean sd hist
n_coders Number of human annotations available for the session numeric 0 1 1 1 3 1.302222 0.5237608 ▇▁▂▁▁

videoFileName

Video file name

Distribution

Distribution of values for videoFileName

Distribution of values for videoFileName

0 missing values.

Summary statistics

name label data_type n_missing complete_rate n_unique empty min max whitespace
videoFileName Video file name character 0 1 225 0 98 157 0

codingFile1

File name of annotation by primary coder

Distribution

Distribution of values for codingFile1

Distribution of values for codingFile1

0 missing values.

Summary statistics

name label data_type n_missing complete_rate n_unique empty min max whitespace
codingFile1 File name of annotation by primary coder character 0 1 225 0 101 154 0

codingFile2

File name of annotation by second coder

Distribution

Distribution of values for codingFile2

Distribution of values for codingFile2

164 missing values.

Summary statistics

name label data_type n_missing complete_rate n_unique empty min max whitespace
codingFile2 File name of annotation by second coder character 164 0.2711111 61 0 153 154 0

codingFile3

File name of annotation by third coder

Distribution

Distribution of values for codingFile3

Distribution of values for codingFile3

218 missing values.

Summary statistics

name label data_type n_missing complete_rate n_unique empty min max whitespace
codingFile3 File name of annotation by third coder character 218 0.0311111 7 0 153 154 0

child.ageSessionRounded

Child age in months (floor of exact age)

Distribution

Distribution of values for child.ageSessionRounded

Distribution of values for child.ageSessionRounded

0 missing values.

Summary statistics

name label data_type n_missing complete_rate min median max mean sd hist
child.ageSessionRounded Child age in months (floor of exact age) numeric 0 1 4 8 14 8.386667 2.273057 ▆▇▆▅▁

child.gender

Child gender

Distribution

Distribution of values for child.gender

Distribution of values for child.gender

0 missing values.

Summary statistics

name label data_type n_missing complete_rate n_unique empty min max whitespace
child.gender Child gender character 0 1 2 0 1 1 0

parent.race.nonwhite

Whether parent reported a non-white race

Distribution

Distribution of values for parent.race.nonwhite

Distribution of values for parent.race.nonwhite

0 missing values.

Summary statistics

name label data_type n_missing complete_rate n_unique empty min max whitespace
parent.race.nonwhite Whether parent reported a non-white race character 0 1 2 0 10 18 0

video.privacy

Video sharing and storage permissions

Distribution

Distribution of values for video.privacy

Distribution of values for video.privacy

0 missing values.

Summary statistics

name label data_type n_missing complete_rate n_unique empty min max whitespace
video.privacy Video sharing and storage permissions character 0 1 4 0 12 20 0

consentnotes

Any notes from parent consent process

Distribution

Distribution of values for consentnotes

Distribution of values for consentnotes

197 missing values.

Summary statistics

name label data_type n_missing complete_rate n_unique empty min max whitespace
consentnotes Any notes from parent consent process character 197 0.1244444 13 0 13 102 0

usable

Any notes from video download process on video quality

Distribution

Distribution of values for usable

Distribution of values for usable

39 missing values.

Summary statistics

name label data_type n_missing complete_rate n_unique empty min max whitespace
usable Any notes from video download process on video quality character 39 0.8266667 1 0 3 3 0

video.nTrialsExpected

Number of trials expected

Distribution

Distribution of values for video.nTrialsExpected

Distribution of values for video.nTrialsExpected

0 missing values.

Summary statistics

name label data_type n_missing complete_rate min median max mean sd hist
video.nTrialsExpected Number of trials expected numeric 0 1 8 25 26 23.96889 3.469756 ▁▁▁▁▇

video.nTrialsFound

Number of trials in video

Distribution

Distribution of values for video.nTrialsFound

Distribution of values for video.nTrialsFound

0 missing values.

Summary statistics

name label data_type n_missing complete_rate min median max mean sd hist
video.nTrialsFound Number of trials in video numeric 0 1 8 25 26 23.65333 3.829491 ▁▁▁▁▇

video.expectedDuration

Expected video duration in seconds

Distribution

Distribution of values for video.expectedDuration

Distribution of values for video.expectedDuration

0 missing values.

Summary statistics

name label data_type n_missing complete_rate min median max mean sd hist
video.expectedDuration Expected video duration in seconds numeric 0 1 159 574 756 527.5628 99.69111 ▁▁▂▇▁

video.actualDuration

Actual video duration in seconds

Distribution

Distribution of values for video.actualDuration

Distribution of values for video.actualDuration

0 missing values.

Summary statistics

name label data_type n_missing complete_rate min median max mean sd hist
video.actualDuration Actual video duration in seconds numeric 0 1 159 574 756 527.5708 99.6957 ▁▁▂▇▁

which.dataset

Specifies video assignment to training,validation and test sets

Distribution

Distribution of values for which.dataset

Distribution of values for which.dataset

0 missing values.

Summary statistics

name label data_type n_missing complete_rate n_unique empty min max whitespace
which.dataset Specifies video assignment to training,validation and test sets character 0 1 3 0 6 10 0

Missingness report

Codebook table

JSON-LD metadata

The following JSON-LD can be found by search engines, if you share this codebook publicly on the web.

{
  "name": "Preferential Physics on Lookit dataset",
  "description": "The annotated and shareable subset of video, annotation, and participant data\n\n\n## Table of variables\nThis table contains variable names, labels, and number of missing values.\nSee the complete codebook for more.\n\n|name                    |label                                                           | n_missing|\n|:-----------------------|:---------------------------------------------------------------|---------:|\n|studyID                 |Study version identifier                                        |         0|\n|childID                 |Child identifier                                                |         0|\n|videoID                 |Video identifier                                                |         0|\n|n_coders                |Number of human annotations available for the session           |         0|\n|videoFileName           |Video file name                                                 |         0|\n|codingFile1             |File name of annotation by primary coder                        |         0|\n|codingFile2             |File name of annotation by second coder                         |       164|\n|codingFile3             |File name of annotation by third coder                          |       218|\n|child.ageSessionRounded |Child age in months (floor of exact age)                        |         0|\n|child.gender            |Child gender                                                    |         0|\n|parent.race.nonwhite    |Whether parent reported a non-white race                        |         0|\n|video.privacy           |Video sharing and storage permissions                           |         0|\n|consentnotes            |Any notes from parent consent process                           |       197|\n|usable                  |Any notes from video download process on video quality          |        39|\n|video.nTrialsExpected   |Number of trials expected                                       |         0|\n|video.nTrialsFound      |Number of trials in video                                       |         0|\n|video.expectedDuration  |Expected video duration in seconds                              |         0|\n|video.actualDuration    |Actual video duration in seconds                                |         0|\n|which.dataset           |Specifies video assignment to training,validation and test sets |         0|\n\n### Note\nThis dataset was automatically described using the [codebook R package](https://rubenarslan.github.io/codebook/) (version 0.9.2).",
  "datePublished": "2022-01-14",
  "creator": {
    "@type": "Person",
    "givenName": "Junyi",
    "familyName": "Chu",
    "email": "jchu@mit.edu",
    "affiliation": {
      "@type": "Organization",
      "name": "MIT"
    }
  },
  "keywords": ["studyID", "childID", "videoID", "n_coders", "videoFileName", "codingFile1", "codingFile2", "codingFile3", "child.ageSessionRounded", "child.gender", "parent.race.nonwhite", "video.privacy", "consentnotes", "usable", "video.nTrialsExpected", "video.nTrialsFound", "video.expectedDuration", "video.actualDuration", "which.dataset"],
  "@context": "http://schema.org/",
  "@type": "Dataset",
  "variableMeasured": [
    {
      "name": "studyID",
      "description": "Study version identifier",
      "@type": "propertyValue"
    },
    {
      "name": "childID",
      "description": "Child identifier",
      "@type": "propertyValue"
    },
    {
      "name": "videoID",
      "description": "Video identifier",
      "@type": "propertyValue"
    },
    {
      "name": "n_coders",
      "description": "Number of human annotations available for the session",
      "@type": "propertyValue"
    },
    {
      "name": "videoFileName",
      "description": "Video file name",
      "@type": "propertyValue"
    },
    {
      "name": "codingFile1",
      "description": "File name of annotation by primary coder",
      "@type": "propertyValue"
    },
    {
      "name": "codingFile2",
      "description": "File name of annotation by second coder",
      "@type": "propertyValue"
    },
    {
      "name": "codingFile3",
      "description": "File name of annotation by third coder",
      "@type": "propertyValue"
    },
    {
      "name": "child.ageSessionRounded",
      "description": "Child age in months (floor of exact age)",
      "@type": "propertyValue"
    },
    {
      "name": "child.gender",
      "description": "Child gender",
      "@type": "propertyValue"
    },
    {
      "name": "parent.race.nonwhite",
      "description": "Whether parent reported a non-white race",
      "@type": "propertyValue"
    },
    {
      "name": "video.privacy",
      "description": "Video sharing and storage permissions",
      "@type": "propertyValue"
    },
    {
      "name": "consentnotes",
      "description": "Any notes from parent consent process",
      "@type": "propertyValue"
    },
    {
      "name": "usable",
      "description": "Any notes from video download process on video quality",
      "@type": "propertyValue"
    },
    {
      "name": "video.nTrialsExpected",
      "description": "Number of trials expected",
      "@type": "propertyValue"
    },
    {
      "name": "video.nTrialsFound",
      "description": "Number of trials in video",
      "@type": "propertyValue"
    },
    {
      "name": "video.expectedDuration",
      "description": "Expected video duration in seconds",
      "@type": "propertyValue"
    },
    {
      "name": "video.actualDuration",
      "description": "Actual video duration in seconds",
      "@type": "propertyValue"
    },
    {
      "name": "which.dataset",
      "description": "Specifies video assignment to training,validation and test sets",
      "@type": "propertyValue"
    }
  ]
}